pacman::p_load(tidyverse, data.table, broom,latex2exp, ggpubr, kableExtra, 
               R.utils, sf, raster, modelsummary, stringi, plm)
rm(list=ls())
options("modelsummary_format_numeric_latex" = "plain")
`%!in%` = Negate(`%in%`)
################################################################################


########################### Destination-level

for(spatial_unit in c('amc_id','microregion_id','mesoregion_id','state')){
  
  id_type_list = c('county_known')
  
  df = fread("../../data/agribusiness/clean/beef_brazil.csv.gz")
  prod_list = df[, lapply(.SD, sum), by=.(product_type), .SDcols=c('equivalent_tonnes','fob_usd')][, c('fob_usd_share','equivalent_tonnes_share') := list(fob_usd/sum(df$fob_usd),equivalent_tonnes/sum(df$equivalent_tonnes))][order(-fob_usd_share),]
  df_b_br = df[product_type %in% c("BEEF BONELESS") & id_type %in% id_type_list & destination!='BRAZIL',]
  
  df = fread("../../data/agribusiness/clean/soybean_brazil.csv.gz")
  prod_list = df[, lapply(.SD, sum), by=.(product_type), .SDcols=c('equivalent_tonnes','fob_usd')][, c('fob_usd_share','equivalent_tonnes_share') := list(fob_usd/sum(df$fob_usd),equivalent_tonnes/sum(df$equivalent_tonnes))][order(-fob_usd_share),]
  df_s_br = df[product_type %in% c("Soy bean equivalents") & id_type %in% id_type_list & destination!='BRAZIL',]
  
  df = fread("../../data/agribusiness/clean/maize_brazil.csv.gz")
  prod_list = df[, lapply(.SD, sum), by=.(product_type), .SDcols=c('equivalent_tonnes','fob_usd')][, c('fob_usd_share','equivalent_tonnes_share') := list(fob_usd/sum(df$fob_usd),equivalent_tonnes/sum(df$equivalent_tonnes))][order(-fob_usd_share),]
  df_m_br = df[product_type %in% c("Corn equivalents") & id_type %in% id_type_list & destination!='BRAZIL',]
  
  df = fread("../../data/agribusiness/clean/soybean_argentina.csv.gz")
  prod_list = df[, lapply(.SD, sum), by=.(product_type), .SDcols=c('equivalent_tonnes','fob_usd')][, c('fob_usd_share','equivalent_tonnes_share') := list(fob_usd/sum(df$fob_usd),equivalent_tonnes/sum(df$equivalent_tonnes))][order(-fob_usd_share),]
  df_s_ar = df[product_type %in% c("SOYBEANS") & id_type %in% id_type_list,]
  
  #Market shares
  if(spatial_unit=='amc_id'){
    df_b_br$spatial_id = df_b_br$amc_id
    df_s_br$spatial_id = df_s_br$amc_id
    df_m_br$spatial_id = df_m_br$amc_id
    df_s_ar$spatial_id = df_s_ar$county_id}
  
  if(spatial_unit=='microregion_id'){
    df_b_br$spatial_id = df_b_br$microregion_id
    df_s_br$spatial_id = df_s_br$microregion_id
    df_m_br$spatial_id = df_m_br$microregion_id
    df_s_ar$spatial_id = df_s_ar$county_id}
  
  if(spatial_unit=='mesoregion_id'){
    df_b_br$spatial_id = df_b_br$mesoregion_id
    df_s_br$spatial_id = df_s_br$mesoregion_id
    df_m_br$spatial_id = df_m_br$mesoregion_id
    df_s_ar$spatial_id = df_s_ar$county_id}
  
  if(spatial_unit=='state'){
    df_b_br$spatial_id = df_b_br$state
    df_s_br$spatial_id = df_s_br$state
    df_m_br$spatial_id = df_m_br$state
    df_s_ar$spatial_id = df_s_ar$state}
  
  var_list = c('year','country','commodity','product_type','spatial_id','state',
               'exporter','exporter_group',
               'destination','destination_bloc',
               'equivalent_tonnes','fob_usd')
  
  df = rbind(df_b_br[destination!='BRAZIL', ..var_list],
             df_s_br[destination!='BRAZIL', ..var_list],
             df_m_br[destination!='BRAZIL', ..var_list],
             df_s_ar[id_type %in% id_type_list, ..var_list])
  
  df[destination %in% c('CHINA (MAINLAND)','CHINA (HONG KONG)'),]$destination = 'CHINA'
  exporter_drop_list = c('UNKNOWN')
  exporter_group_drop_list = c('DOMESTIC CONSUMPTION','UNKNOWN','PROCESSED DOMESTICALLY')
  destination_drop_list = c('UNKNOWN COUNTRY')
  state_drop_list = c("UNKNOWN STATE","UNKNOWN STATE","IMPORTS + STOCK")
  df = df[!is.na(spatial_id),][destination %!in% destination_drop_list,][exporter %!in% exporter_drop_list,][exporter_group %!in% exporter_group_drop_list,][state %!in% state_drop_list,]
  df$destination_market = df$destination_bloc
  df_main = df
  
  #Nationwide shares
  df_x = df_main[, lapply(.SD, sum), by=.(year, country, commodity, destination_market), .SDcols=c('equivalent_tonnes','fob_usd')]
  df_y = df_main[, lapply(.SD, sum), by=.(year, country, commodity), .SDcols=c('equivalent_tonnes','fob_usd')]
  df = merge(df_x, df_y, by=c('year','country','commodity'))
  df[, c('share_q','share_v') := list( equivalent_tonnes.x/equivalent_tonnes.y, fob_usd.x/fob_usd.y ) ]
  df_shares_nation = df[, list(year, country, commodity, destination_market, share_q, share_v)]  %>% group_by(year, country, commodity) %>% top_n(3, share_q)
  
  #Local shares
  df_x = df_main[, lapply(.SD, sum), by=.(year, country, commodity, spatial_id, destination_market), .SDcols=c('equivalent_tonnes','fob_usd')]
  df_y = df_main[, lapply(.SD, sum), by=.(year, country, commodity, spatial_id), .SDcols=c('equivalent_tonnes','fob_usd')]
  df = merge(df_x, df_y, by=c('year','country','commodity','spatial_id'))
  df[, c('share_q','share_v') := list( equivalent_tonnes.x/equivalent_tonnes.y, fob_usd.x/fob_usd.y ) ]
  df_shares = df[, list(year, country, commodity, spatial_id, destination_market, share_q,share_v)]
  df_locations = unique(df_shares[,c('country','spatial_id')], by=c('country','spatial_id'))
  
  #Panel
  df1=complete(df_shares[commodity=='beef', c('year','destination_market','spatial_id')], year, destination_market, spatial_id)
  df1$commodity='beef'
  df2=complete(df_shares[commodity=='maize',c('year','destination_market','spatial_id')], year, destination_market, spatial_id)
  df2$commodity='maize'
  df3=complete(df_shares[commodity=='soybean' & country=='BRAZIL',c('year','destination_market','spatial_id')], year, destination_market, spatial_id)
  df3$commodity='soybean'
  df4=complete(df_shares[commodity=='soybean' & country=='ARGENTINA',c('year','destination_market','spatial_id')], year, destination_market, spatial_id)
  df4$commodity='soybean'
  df=rbind(df1,df2,df3,df4)
  
  df_combinations = merge(df, df_locations, by='spatial_id')
  df=setDT(merge(df_combinations, df_shares, by=c('year','commodity','country','spatial_id','destination_market'), all.x=T))
  df[is.na(share_q),]$share_q = 0
  df[is.na(share_v),]$share_v = 0
  df$currently_present = 'no'
  df[share_q>0,]$currently_present = 'yes'
  df_panel = df
  
  df = df %>% dplyr::group_by(commodity,destination_market,spatial_id) %>% dplyr::summarise(max = max(share_q), .groups = "drop_last")
  df$presence = 'never present'
  df[df$max>0,]$presence = 'present at least once'
  df = setDT(df)[,-c('max')]
  
  df_panel = merge(df_panel,df, by=c('commodity','spatial_id','destination_market'))
  write.csv(df_panel, gzfile(paste0("../../data/agribusiness/clean/persistence_destinations_",spatial_unit,".csv.gz")), row.names = FALSE)

}

#### Regressions

for(spatial_unit in c('amc_id','microregion_id','mesoregion_id','state')){
  
  df = fread(paste0("../../data/agribusiness/clean/persistence_destinations_",spatial_unit,".csv.gz"))
  df$dummy_present = 1*(df$currently_present=='yes')
  df$id = paste0(df$country, df$commodity, df$spatial_id, df$destination_market)
  pdata <- pdata.frame(df, index = c("id", "year"), drop.index = TRUE)
  
  #Shares
  m_share_q_p <- plm(share_q ~ lag(share_q, 1), data = pdata, model = "pooling")
  robust_vcov <- vcovHC(m_share_q_p, type = 'HC1')
  m_share_q_p$vcov <- robust_vcov
  
  #Presence
  m_presence_q_p <- plm(dummy_present ~ lag(dummy_present, 1), data = pdata, model = "pooling")
  robust_vcov <- vcovHC(m_presence_q_p, type = 'HC1')
  m_presence_q_p$vcov <- robust_vcov

  if(spatial_unit=='amc_id'){
    m_share_q_p_amc_id = m_share_q_p
    m_presence_q_p_amc_id = m_presence_q_p}
  
  if(spatial_unit=='microregion_id'){
    m_share_q_p_microregion_id = m_share_q_p
    m_presence_q_p_microregion_id = m_presence_q_p}
  
  if(spatial_unit=='mesoregion_id'){
    m_share_q_p_mesoregion_id = m_share_q_p
    m_presence_q_p_mesoregion_id = m_presence_q_p}

  if(spatial_unit=='state'){
    m_share_q_p_state = m_share_q_p
    m_presence_q_p_state = m_presence_q_p}
}

#### Summary table - shares
tab_title = paste0("Persistence of a destination's market share in an upstream market.")
rows <- tribble(~term,  ~OLS, ~OLS,~OLS,~OLS,
                'Upstream market definition', 'AMC/County','Mesoregion','Microregion','State/Province')
attr(rows, 'position') <- c(3)
gm <- tibble::tribble(~raw,        ~clean,          ~fmt,
                      "nobs",      "Observations",  0,
                      "r.squared", "R2", 3)
results_table_s <- msummary(list(m_share_q_p_amc_id,m_share_q_p_microregion_id,m_share_q_p_mesoregion_id,m_share_q_p_state),
                          coef_rename =  c('$\\rho$'),
                          coef_omit = 'Intercept',
                          title = paste0('\\label{tab: destinations - persistence - shares}',tab_title),
                          estimate = "{estimate}{stars}", gof_map = gm, escape=F,
                          add_rows = rows,
                          output='latex')
kableExtra::save_kable(results_table_s, file = "../../output/tables/appendix/table4_destinationshares.tex")

#### Summary table - presence
tab_title = paste0("Persistence of a destination's presence in an upstream market.")
rows <- tribble(~term,  ~OLS, ~OLS,~OLS,~OLS,
                'Upstream market definition', 'AMC/County','Mesoregion','Microregion','State/Province')
attr(rows, 'position') <- c(3)
gm <- tibble::tribble(~raw,        ~clean,          ~fmt,
                      "nobs",      "Observations",  0,
                      "r.squared", "R2", 3)
results_table_p <- msummary(list(m_presence_q_p_amc_id,m_presence_q_p_microregion_id,m_presence_q_p_mesoregion_id,m_presence_q_p_state),
                          coef_rename =  c('$\\rho$'),
                          coef_omit = 'Intercept',
                          title = paste0('\\label{tab: destinations - persistence - presence}',tab_title),
                          estimate = "{estimate}{stars}", gof_map = gm, escape=F,
                          add_rows = rows,
                          output='latex')
kableExtra::save_kable(results_table_p, file = "../../output/tables/appendix/table5_destinationpresence.tex")



########################### Firm-level 

for(spatial_unit in c('amc_id','microregion_id','mesoregion_id','state')){
  
  id_type_list = c('county_known')
  
  df = fread("../../data/agribusiness/clean/beef_brazil.csv.gz")
  prod_list = df[, lapply(.SD, sum), by=.(product_type), .SDcols=c('equivalent_tonnes','fob_usd')][, c('fob_usd_share','equivalent_tonnes_share') := list(fob_usd/sum(df$fob_usd),equivalent_tonnes/sum(df$equivalent_tonnes))][order(-fob_usd_share),]
  df_b_br = df[product_type %in% c("BEEF BONELESS") & id_type %in% id_type_list & destination!='BRAZIL',]
  
  df = fread("../../data/agribusiness/clean/soybean_brazil.csv.gz")
  prod_list = df[, lapply(.SD, sum), by=.(product_type), .SDcols=c('equivalent_tonnes','fob_usd')][, c('fob_usd_share','equivalent_tonnes_share') := list(fob_usd/sum(df$fob_usd),equivalent_tonnes/sum(df$equivalent_tonnes))][order(-fob_usd_share),]
  df_s_br = df[product_type %in% c("Soy bean equivalents") & id_type %in% id_type_list & destination!='BRAZIL',]
  
  df = fread("../../data/agribusiness/clean/maize_brazil.csv.gz")
  prod_list = df[, lapply(.SD, sum), by=.(product_type), .SDcols=c('equivalent_tonnes','fob_usd')][, c('fob_usd_share','equivalent_tonnes_share') := list(fob_usd/sum(df$fob_usd),equivalent_tonnes/sum(df$equivalent_tonnes))][order(-fob_usd_share),]
  df_m_br = df[product_type %in% c("Corn equivalents") & id_type %in% id_type_list & destination!='BRAZIL',]
  
  df = fread("../../data/agribusiness/clean/soybean_argentina.csv.gz")
  prod_list = df[, lapply(.SD, sum), by=.(product_type), .SDcols=c('equivalent_tonnes','fob_usd')][, c('fob_usd_share','equivalent_tonnes_share') := list(fob_usd/sum(df$fob_usd),equivalent_tonnes/sum(df$equivalent_tonnes))][order(-fob_usd_share),]
  df_s_ar = df[product_type %in% c("SOYBEANS") & id_type %in% id_type_list,]
  
  #Market shares
  if(spatial_unit=='amc_id'){
    df_b_br$spatial_id = df_b_br$amc_id
    df_s_br$spatial_id = df_s_br$amc_id
    df_m_br$spatial_id = df_m_br$amc_id
    df_s_ar$spatial_id = df_s_ar$county_id}
  
  if(spatial_unit=='microregion_id'){
    df_b_br$spatial_id = df_b_br$microregion_id
    df_s_br$spatial_id = df_s_br$microregion_id
    df_m_br$spatial_id = df_m_br$microregion_id
    df_s_ar$spatial_id = df_s_ar$county_id}
  
  if(spatial_unit=='mesoregion_id'){
    df_b_br$spatial_id = df_b_br$mesoregion_id
    df_s_br$spatial_id = df_s_br$mesoregion_id
    df_m_br$spatial_id = df_m_br$mesoregion_id
    df_s_ar$spatial_id = df_s_ar$county_id}
  
  if(spatial_unit=='state'){
    df_b_br$spatial_id = df_b_br$state
    df_s_br$spatial_id = df_s_br$state
    df_m_br$spatial_id = df_m_br$state
    df_s_ar$spatial_id = df_s_ar$state}
  
  var_list = c('year','country','commodity','product_type','spatial_id','state',
               'exporter','exporter_group',
               'equivalent_tonnes','fob_usd')
  
  df = rbind(df_b_br[destination!='BRAZIL', ..var_list],
             df_s_br[destination!='BRAZIL', ..var_list],
             df_m_br[destination!='BRAZIL', ..var_list],
             df_s_ar[id_type %in% id_type_list, ..var_list])
  
  exporter_drop_list = c('UNKNOWN')
  exporter_group_drop_list = c('DOMESTIC CONSUMPTION','UNKNOWN','PROCESSED DOMESTICALLY')
  state_drop_list = c("UNKNOWN STATE","UNKNOWN STATE","IMPORTS + STOCK")
  df_main = df[!is.na(spatial_id),][exporter %!in% exporter_drop_list,][exporter_group %!in% exporter_group_drop_list,][state %!in% state_drop_list,]
  
  #Nationwide shares
  df_x = df_main[, lapply(.SD, sum), by=.(year, country, commodity, exporter_group), .SDcols=c('equivalent_tonnes','fob_usd')]
  df_y = df_main[, lapply(.SD, sum), by=.(year, country, commodity), .SDcols=c('equivalent_tonnes','fob_usd')]
  df = merge(df_x, df_y, by=c('year','country','commodity'))
  df[, c('share_q','share_v') := list( equivalent_tonnes.x/equivalent_tonnes.y, fob_usd.x/fob_usd.y ) ]
  df_shares_nation = df[, list(year, country, commodity, exporter_group, share_q, share_v)]  %>% group_by(year, country, commodity) %>% top_n(3, share_q)
  
  #Local shares
  df_x = df_main[, lapply(.SD, sum), by=.(year, country, commodity, spatial_id, exporter_group), .SDcols=c('equivalent_tonnes','fob_usd')]
  df_y = df_main[, lapply(.SD, sum), by=.(year, country, commodity, spatial_id), .SDcols=c('equivalent_tonnes','fob_usd')]
  df = merge(df_x, df_y, by=c('year','country','commodity','spatial_id'))
  df[, c('share_q','share_v') := list( equivalent_tonnes.x/equivalent_tonnes.y, fob_usd.x/fob_usd.y ) ]
  df_shares = df[, list(year, country, commodity, spatial_id, exporter_group, share_q,share_v)]
  df_locations = unique(df_shares[,c('country','spatial_id')], by=c('country','spatial_id'))
  
  #Panel
  df1=complete(df_shares[commodity=='beef', c('year','exporter_group','spatial_id')], year, exporter_group, spatial_id)
  df1$commodity='beef'
  df2=complete(df_shares[commodity=='maize',c('year','exporter_group','spatial_id')], year, exporter_group, spatial_id)
  df2$commodity='maize'
  df3=complete(df_shares[commodity=='soybean' & country=='BRAZIL',c('year','exporter_group','spatial_id')], year, exporter_group, spatial_id)
  df3$commodity='soybean'
  df4=complete(df_shares[commodity=='soybean' & country=='ARGENTINA',c('year','exporter_group','spatial_id')], year, exporter_group, spatial_id)
  df4$commodity='soybean'
  df=rbind(df1,df2,df3,df4)
  
  df_combinations = merge(df, df_locations, by='spatial_id')
  df=setDT(merge(df_combinations, df_shares, by=c('year','commodity','country','spatial_id','exporter_group'), all.x=T))
  df[is.na(share_q),]$share_q = 0
  df[is.na(share_v),]$share_v = 0
  df$currently_present = 'no'
  df[share_q>0,]$currently_present = 'yes'
  df_panel = df
  
  df = df %>% dplyr::group_by(commodity,exporter_group,spatial_id) %>% dplyr::summarise(max = max(share_q), .groups = "drop_last")
  df$presence = 'never present'
  df[df$max>0,]$presence = 'present at least once'
  df = setDT(df)[,-c('max')]
  
  df_panel = merge(df_panel,df, by=c('commodity','spatial_id','exporter_group'))
  write.csv(df_panel, gzfile(paste0("../../data/agribusiness/clean/persistence_firms_",spatial_unit,".csv.gz")), row.names = FALSE)
}

#### Regressions

for(spatial_unit in c('amc_id','microregion_id','mesoregion_id','state')){
  
  df = fread(paste0("../../data/agribusiness/clean/persistence_firms_",spatial_unit,".csv.gz"))
  df$dummy_present = 1*(df$currently_present=='yes')
  df$id = paste0(df$country, df$commodity, df$spatial_id, df$exporter_group)
  pdata <- pdata.frame(df, index = c("id", "year"), drop.index = TRUE)
  
  #Shares
  m_share_q_p <- plm(share_q ~ lag(share_q, 1), data = pdata, model = "pooling")
  robust_vcov <- vcovHC(m_share_q_p, type = 'HC1')
  m_share_q_p$vcov <- robust_vcov
  
  #Presence
  m_presence_q_p <- plm(dummy_present ~ lag(dummy_present, 1), data = pdata, model = "pooling")
  robust_vcov <- vcovHC(m_presence_q_p, type = 'HC1')
  m_presence_q_p$vcov <- robust_vcov
  
  if(spatial_unit=='amc_id'){
    m_share_q_p_amc_id = m_share_q_p
    m_presence_q_p_amc_id = m_presence_q_p}
  
  if(spatial_unit=='microregion_id'){
    m_share_q_p_microregion_id = m_share_q_p
    m_presence_q_p_microregion_id = m_presence_q_p}
  
  if(spatial_unit=='mesoregion_id'){
    m_share_q_p_mesoregion_id = m_share_q_p
    m_presence_q_p_mesoregion_id = m_presence_q_p}
  
  if(spatial_unit=='state'){
    m_share_q_p_state = m_share_q_p
    m_presence_q_p_state = m_presence_q_p}
}

#### Summary table - shares

tab_title = paste0("Persistence of a firm's market share in an upstream market.")
rows <- tribble(~term,  ~OLS, ~OLS,~OLS,~OLS,
                'Upstream market definition', 'AMC/County','Mesoregion','Microregion','State/Province')
attr(rows, 'position') <- c(3)
gm <- tibble::tribble(~raw,        ~clean,          ~fmt,
                      "nobs",      "Observations",  0,
                      "r.squared", "R2", 3)
results_table_s <- msummary(list('AR(1)'=m_share_q_p_amc_id,'AR(1)'=m_share_q_p_microregion_id,'AR(1)'=m_share_q_p_mesoregion_id,'AR(1)'=m_share_q_p_state),
                            coef_rename =  c('$\\rho$'),
                            coef_omit = 'Intercept',
                            title = paste0('\\label{tab: agribusiness - persistence - shares}',tab_title),
                            estimate = "{estimate}{stars}", gof_map = gm, escape=F,
                            add_rows = rows,
                            output='latex')
kableExtra::save_kable(results_table_s, file = "../../output/tables/appendix/table2_firmshares.tex")

#### Summary table - presence

tab_title = paste0("Persistence of a firm's presence in an upstream market.")
rows <- tribble(~term,  ~OLS, ~OLS,~OLS,~OLS,
                'Upstream market definition', 'AMC/County','Mesoregion','Microregion','State/Province')
attr(rows, 'position') <- c(3)
gm <- tibble::tribble(~raw,        ~clean,          ~fmt,
                      "nobs",      "Observations",  0,
                      "r.squared", "R2", 3)
results_table_p <- msummary(list(m_presence_q_p_amc_id,m_presence_q_p_microregion_id,m_presence_q_p_mesoregion_id,m_presence_q_p_state),
                            coef_rename =  c('$\\rho$'),
                            coef_omit = 'Intercept',
                            title = paste0('\\label{tab: agribusiness - persistence - presence}',tab_title),
                            estimate = "{estimate}{stars}", gof_map = gm, escape=F,
                            add_rows = rows,
                            output='latex')
kableExtra::save_kable(results_table_p, file = "../../output/tables/appendix/table3_firmpresence.tex")


